guttenplagwikiaorg_de-20200215-history
Benutzer:Kahrl/guttenplagcumul.py
!/usr/bin/env python # -*- coding: utf8 -*- from __future__ import unicode_literals from collections import defaultdict import re def split_csv_line(text): ws_pattern = re.compile(r'\s*', re.UNICODE | re.DOTALL) lex_pattern = re.compile(r'(\w+)|"(^"*)"', re.UNICODE | re.DOTALL) separators = ',' result = [] pos = 0 was_separator = False while pos < len(text): ws_match = ws_pattern.match(text, pos) assert ws_match is not None ws_end = ws_match.end() if ws_end len(text): break if textws_end in separators: if was_separator: result.append('') else: was_separator = True pos = ws_end + 1 else: if result: assert was_separator lex_match = lex_pattern.match(text, ws_end) assert lex_match is not None lex_start = lex_match.start() lex_end = lex_match.end() if textlex_start '"' and textlex_end-1 '"' and lex_end >= lex_start + 2: result.append(lex_match.group(2)) else: result.append(lex_match.group(1)) pos = lex_end was_separator = False assert not was_separator return result ### PARSING zeilenanzahl.txt ### lines_per_page = defaultdict(lambda x: 0) with open('zeilenanzahl.txt', 'r') as f: for line in f: parts = line.strip().split(':') assert len(parts) 4 total_normal = 0 total_footnote = 0 if parts2: for x in parts2.split(','): total_normal += int(x) if parts3: for x in parts3.split(','): total_footnote += int(x) lines_per_page[int(parts0)] = total_normal + total_footnote #print(repr(lines_per_page)) ### PARSING guttenplag.csv ### frags = [] with open('guttenplag.csv', 'r') as f: for line in f: parts = split_csv_line(line.decode('utf8')) #print(repr(parts)) diss_pages = parts0 diss_lines = parts1 diss_text = parts2 orig_pages = parts3 orig_lines = parts4 orig_text = parts5 fragtype = parts6 inlit = parts7 source = parts8 url = parts9 note = parts10 if diss_pages 'Seite': continue assert re.match(r'^\s*\d+\s*$', diss_pages) assert re.match(r'^\s*\d+(\s*-\s*\d+)?\s*$', diss_lines) assert fragtype.lower() != 'verdächtig' assert fragtype.lower() != 'keinplagiat' frag_page = int(diss_pages.strip()) if '-' in diss_lines: frag_first_line = int(diss_lines.split('-')0.strip()) frag_last_line = int(diss_lines.split('-')1.strip()) else: frag_first_line = int(diss_lines.strip()) frag_last_line = int(diss_lines.strip()) frags.append((frag_page, frag_first_line, frag_last_line)) ### SORTING ### def sortkey(frag): return '{0:>03}.{1:>03}.{2:>03}'.format(*frag) frags.sort(key = sortkey) ### PROCESSING ### def output_page(output_pagenum, output_plaglines): global lines_per_page if output_plaglines > 0: output_pagelines = lines_per_pageoutput_pagenum if output_plaglines*4 > output_pagelines*3: category = 'Gt75' elif output_plaglines*2 > output_pagelines: category = 'Gt50' else: category = '' print(unicode(output_pagenum) + ',' + unicode(output_pagelines) + ',' + unicode(output_plaglines) + ',' + category) if output_pagenum != 260: assert output_plaglines <= output_pagelines pagenum = 0 linecount = 0 prev_first_line = 0 prev_last_line = 0 for frag in frags: frag_page, frag_first_line, frag_last_line = frag if frag_page != pagenum: output_page(pagenum, linecount) pagenum = frag_page linecount = 0 prev_first_line = 0 prev_last_line = 0 assert frag_page pagenum # input is sorted by page assert frag_first_line >= prev_first_line # input is sorted by line (minor) assert frag_first_line <= frag_last_line if frag_last_line <= prev_last_line: # fully contained in previous fragment print('Warning: fragment is fully contained in predecessor:' + ' page ' + unicode(frag_page) + ' lines ' + unicode(frag_first_line) + '-' + unicode(frag_last_line)) pass elif frag_first_line > prev_last_line: # no overlap with previous fragment linecount += frag_last_line - frag_first_line + 1 else: # partly overlaps with previous fragment if frag_first_line < prev_last_line: print('Warning: fragment overlaps by more than one line with predecessor:' + ' page ' + unicode(frag_page) + ' lines ' + unicode(frag_first_line) + '-' + unicode(frag_last_line)) linecount += frag_last_line - prev_last_line prev_first_line = frag_first_line prev_last_line = frag_last_line output_page(pagenum, linecount)